Primary exercises

Apply the following to survey data:

  1. Select personal information {name, age, gender, height} into a new tibble survey_personal_info.
# Keep only the four personal-information columns and store them in a
# new tibble.
survey_personal_info <- select(survey, name, age, gender, height)
  2. Select the same personal information as in the previous exercise into a new tibble survey_personal_info, but with the initial letter of each variable name in uppercase, e.g. Name, Age etc.
# select() renames while selecting: each argument is written new_name = old_name.
survey_personal_info <- select(
  survey,
  Name = name, Age = age, Gender = gender, Height = height
)
  3. Reorder the variables in the survey dataset such that name, age and gender appear as the first, second and third columns, followed by the remaining variables.
# List the leading columns explicitly; everything() appends all remaining ones.
select(survey, name, age, gender, everything())
# A tibble: 233 × 13
   name    age gender span1 span2 hand  fold  pulse clap  exerc…¹ smokes height m.i  
   <chr> <dbl> <chr>  <dbl> <dbl> <chr> <chr> <dbl> <chr> <chr>   <chr>   <dbl> <chr>
 1 Alys…  18.2 female  18.5  18   right right    92 left  some    never    173  metr…
 2 Todd   17.6 male    19.5  20.5 left  right   104 left  none    regul    178. impe…
 3 Gera…  16.9 male    18    13.3 right left     87 neit… none    occas     NA  <NA> 
 4 Robe…  20.3 male    18.8  18.9 right right    NA neit… none    never    160  metr…
 5 Dust…  23.7 male    20    20   right neit…    35 right some    never    165  metr…
 6 Abby   21   female  18    17.7 right left     64 right some    never    173. impe…
 7 Andre  18.8 male    17.7  17.7 right left     83 right freq    never    183. impe…
 8 Mich…  35.8 female  17    17.3 right right    74 right freq    never    157  metr…
 9 Edwa…  19   male    20    19.5 right right    72 right some    never    175  metr…
10 Carl   22.3 male    18.5  18.5 right right    90 right some    never    167  metr…
# … with 223 more rows, and abbreviated variable name ¹​exercise
  4. Deselect the variables that relate to hand and/or arm (e.g. span1, span2, hand, etc.). See also the description of the survey data.
# Negate the whole group at once to drop the hand/arm related columns.
select(survey, -c(span1, span2, hand, fold, clap))
# A tibble: 233 × 8
   name    gender pulse exercise smokes height m.i        age
   <chr>   <chr>  <dbl> <chr>    <chr>   <dbl> <chr>    <dbl>
 1 Alyson  female    92 some     never    173  metric    18.2
 2 Todd    male     104 none     regul    178. imperial  17.6
 3 Gerald  male      87 none     occas     NA  <NA>      16.9
 4 Robert  male      NA none     never    160  metric    20.3
 5 Dustin  male      35 some     never    165  metric    23.7
 6 Abby    female    64 some     never    173. imperial  21  
 7 Andre   male      83 freq     never    183. imperial  18.8
 8 Michael female    74 freq     never    157  metric    35.8
 9 Edward  male      72 some     never    175  metric    19  
10 Carl    male      90 some     never    167  metric    22.3
# … with 223 more rows
  5. Select the top 20 names along with gender.
# 1) Two steps: first keep the two columns, then show the first 20 rows.
survey_sub <- select(survey, name, gender)
head(survey_sub, n = 20)
# A tibble: 20 × 2
   name    gender
   <chr>   <chr> 
 1 Alyson  female
 2 Todd    male  
 3 Gerald  male  
 4 Robert  male  
 5 Dustin  male  
 6 Abby    female
 7 Andre   male  
 8 Michael female
 9 Edward  male  
10 Carl    male  
11 Noemi   female
12 Alfred  male  
13 Bernice female
14 Velma   female
15 Eddie   male  
16 Fern    female
17 Carolyn female
18 Virgil  male  
19 Ken     male  
20 Richard male  
# 2) Shorter solution without the intermediate variable 'survey_sub':
# nest select() inside head(). Both name and gender must be selected,
# as the exercise asks for the names along with gender.
head(select(survey, name, gender), 20)
# A tibble: 20 × 2
   name    gender
   <chr>   <chr> 
 1 Alyson  female
 2 Todd    male  
 3 Gerald  male  
 4 Robert  male  
 5 Dustin  male  
 6 Abby    female
 7 Andre   male  
 8 Michael female
 9 Edward  male  
10 Carl    male  
11 Noemi   female
12 Alfred  male  
13 Bernice female
14 Velma   female
15 Eddie   male  
16 Fern    female
17 Carolyn female
18 Virgil  male  
19 Ken     male  
20 Richard male  
  6. Reproduce the following tibbles (note that variables are renamed and reshuffled):

    6.1 First 5 observations.

    # Remark: passing select(...) directly as the first argument of head()
    # avoids having to create an intermediate variable.
    head(select(survey, SPAN1 = span1, SPAN2 = span2, everything()), n = 5)
    # A tibble: 5 × 13
      SPAN1 SPAN2 name   gender hand  fold  pulse clap  exerc…¹ smokes height m.i     age
      <dbl> <dbl> <chr>  <chr>  <chr> <chr> <dbl> <chr> <chr>   <chr>   <dbl> <chr> <dbl>
    1  18.5  18   Alyson female right right    92 left  some    never    173  metr…  18.2
    2  19.5  20.5 Todd   male   left  right   104 left  none    regul    178. impe…  17.6
    3  18    13.3 Gerald male   right left     87 neit… none    occas     NA  <NA>   16.9
    4  18.8  18.9 Robert male   right right    NA neit… none    never    160  metr…  20.3
    5  20    20   Dustin male   right neit…    35 right some    never    165  metr…  23.7
    # … with abbreviated variable name ¹​exercise

    6.2 Last 3 observations.

    # Rename and move the three columns to the front, then show the last 3 rows.
    tail(
      select(survey, Hand = hand, Fold = fold, Clap = clap, everything()),
      n = 3
    )
    # A tibble: 3 × 13
      Hand  Fold  Clap  name   gender span1 span2 pulse exerc…¹ smokes height m.i     age
      <chr> <chr> <chr> <chr>  <chr>  <dbl> <dbl> <dbl> <chr>   <chr>   <dbl> <chr> <dbl>
    1 right right right Tracey female  17.5  16.5    NA some    never    170  metr…  18.6
    2 right right right Keith  male    21    21.5    90 some    never    183  metr…  17.2
    3 right right right Celina female  17.6  17.3    85 freq    never    168. metr…  17.8
    # … with abbreviated variable name ¹​exercise

Extra exercises

  1. Rename the m.i variable to system.
# 1) Very tedious: every variable name has to be typed out just to
# rename 'm.i' to 'system' while keeping the original column order.
select(
  survey,
  name, gender, span1, span2, hand, fold, pulse, clap,
  exercise, smokes, height, system = m.i, age
)
# A tibble: 233 × 13
   name    gender span1 span2 hand  fold    pulse clap   exerc…¹ smokes height system
   <chr>   <chr>  <dbl> <dbl> <chr> <chr>   <dbl> <chr>  <chr>   <chr>   <dbl> <chr> 
 1 Alyson  female  18.5  18   right right      92 left   some    never    173  metric
 2 Todd    male    19.5  20.5 left  right     104 left   none    regul    178. imper…
 3 Gerald  male    18    13.3 right left       87 neith… none    occas     NA  <NA>  
 4 Robert  male    18.8  18.9 right right      NA neith… none    never    160  metric
 5 Dustin  male    20    20   right neither    35 right  some    never    165  metric
 6 Abby    female  18    17.7 right left       64 right  some    never    173. imper…
 7 Andre   male    17.7  17.7 right left       83 right  freq    never    183. imper…
 8 Michael female  17    17.3 right right      74 right  freq    never    157  metric
 9 Edward  male    20    19.5 right right      72 right  some    never    175  metric
10 Carl    male    18.5  18.5 right right      90 right  some    never    167  metric
# … with 223 more rows, 1 more variable: age <dbl>, and abbreviated variable name
#   ¹​exercise
# 2) Shorter, but as a side effect m.i (now system) moves to the
# front of the tibble.
select(survey, system = m.i, everything())
# A tibble: 233 × 13
   system   name    gender span1 span2 hand  fold   pulse clap  exerc…¹ smokes height
   <chr>    <chr>   <chr>  <dbl> <dbl> <chr> <chr>  <dbl> <chr> <chr>   <chr>   <dbl>
 1 metric   Alyson  female  18.5  18   right right     92 left  some    never    173 
 2 imperial Todd    male    19.5  20.5 left  right    104 left  none    regul    178.
 3 <NA>     Gerald  male    18    13.3 right left      87 neit… none    occas     NA 
 4 metric   Robert  male    18.8  18.9 right right     NA neit… none    never    160 
 5 metric   Dustin  male    20    20   right neith…    35 right some    never    165 
 6 imperial Abby    female  18    17.7 right left      64 right some    never    173.
 7 imperial Andre   male    17.7  17.7 right left      83 right freq    never    183.
 8 metric   Michael female  17    17.3 right right     74 right freq    never    157 
 9 metric   Edward  male    20    19.5 right right     72 right some    never    175 
10 metric   Carl    male    18.5  18.5 right right     90 right some    never    167 
# … with 223 more rows, 1 more variable: age <dbl>, and abbreviated variable name
#   ¹​exercise
# 3) Use the rename() function (see ?dplyr::rename); the column keeps its position.
rename(survey, system = m.i)
# A tibble: 233 × 13
   name    gender span1 span2 hand  fold    pulse clap   exerc…¹ smokes height system
   <chr>   <chr>  <dbl> <dbl> <chr> <chr>   <dbl> <chr>  <chr>   <chr>   <dbl> <chr> 
 1 Alyson  female  18.5  18   right right      92 left   some    never    173  metric
 2 Todd    male    19.5  20.5 left  right     104 left   none    regul    178. imper…
 3 Gerald  male    18    13.3 right left       87 neith… none    occas     NA  <NA>  
 4 Robert  male    18.8  18.9 right right      NA neith… none    never    160  metric
 5 Dustin  male    20    20   right neither    35 right  some    never    165  metric
 6 Abby    female  18    17.7 right left       64 right  some    never    173. imper…
 7 Andre   male    17.7  17.7 right left       83 right  freq    never    183. imper…
 8 Michael female  17    17.3 right right      74 right  freq    never    157  metric
 9 Edward  male    20    19.5 right right      72 right  some    never    175  metric
10 Carl    male    18.5  18.5 right right      90 right  some    never    167  metric
# … with 223 more rows, 1 more variable: age <dbl>, and abbreviated variable name
#   ¹​exercise
  2. Select name along with all categorical variables into a new tibble survey_cats.
# Categorical data: variables that take categories as values, e.g.
#
# gender   : {male, female}
# hand     : {left, right}
# fold     : {left, right, neither}
# clap     : {left, right, neither}
# exercise : {freq, some, none}
# smokes   : {heavy, regul, occas, never}
# m.i      : {metric, imperial}
#
survey_cats <- select(
  survey,
  name, gender, hand, fold, clap, exercise, smokes, m.i
)
survey_cats
# A tibble: 233 × 8
   name    gender hand  fold    clap    exercise smokes m.i     
   <chr>   <chr>  <chr> <chr>   <chr>   <chr>    <chr>  <chr>   
 1 Alyson  female right right   left    some     never  metric  
 2 Todd    male   left  right   left    none     regul  imperial
 3 Gerald  male   right left    neither none     occas  <NA>    
 4 Robert  male   right right   neither none     never  metric  
 5 Dustin  male   right neither right   some     never  metric  
 6 Abby    female right left    right   some     never  imperial
 7 Andre   male   right left    right   freq     never  imperial
 8 Michael female right right   right   freq     never  metric  
 9 Edward  male   right right   right   some     never  metric  
10 Carl    male   right right   right   some     never  metric  
# … with 223 more rows
  3. Create a new tibble survey_nums with name and all numerical variables.
# Keep name plus the numerical (<dbl>) variables, listed by hand; typing the
# tibble's name on its own line prints it.
survey_nums <- select(survey, name, span1, span2, pulse, height, age)
survey_nums
# A tibble: 233 × 6
   name    span1 span2 pulse height   age
   <chr>   <dbl> <dbl> <dbl>  <dbl> <dbl>
 1 Alyson   18.5  18      92   173   18.2
 2 Todd     19.5  20.5   104   178.  17.6
 3 Gerald   18    13.3    87    NA   16.9
 4 Robert   18.8  18.9    NA   160   20.3
 5 Dustin   20    20      35   165   23.7
 6 Abby     18    17.7    64   173.  21  
 7 Andre    17.7  17.7    83   183.  18.8
 8 Michael  17    17.3    74   157   35.8
 9 Edward   20    19.5    72   175   19  
10 Carl     18.5  18.5    90   167   22.3
# … with 223 more rows
  4. For this exercise you’ll need an additional helper function, where(), explained
    here (see also ?tidyselect::where).

    4.1 Reproduce the result from the previous exercise (3) without spelling out all numerical variable names. Hint: you’ll also need the is.numeric function (see ?is.numeric for help).

    # A single select() call suffices: name is listed first, then where()
    # adds every column for which is.numeric() returns TRUE, in their
    # original order.
    select(survey, name, where(is.numeric))
    # A tibble: 233 × 6
       name    span1 span2 pulse height   age
       <chr>   <dbl> <dbl> <dbl>  <dbl> <dbl>
     1 Alyson   18.5  18      92   173   18.2
     2 Todd     19.5  20.5   104   178.  17.6
     3 Gerald   18    13.3    87    NA   16.9
     4 Robert   18.8  18.9    NA   160   20.3
     5 Dustin   20    20      35   165   23.7
     6 Abby     18    17.7    64   173.  21  
     7 Andre    17.7  17.7    83   183.  18.8
     8 Michael  17    17.3    74   157   35.8
     9 Edward   20    19.5    72   175   19  
    10 Carl     18.5  18.5    90   167   22.3
    # … with 223 more rows

    4.2 Select all non-numerical variables.

    # 1) Negate the where() selection to keep every column that is not numeric.
    select(survey, !where(is.numeric))
    # A tibble: 233 × 8
       name    gender hand  fold    clap    exercise smokes m.i     
       <chr>   <chr>  <chr> <chr>   <chr>   <chr>    <chr>  <chr>   
     1 Alyson  female right right   left    some     never  metric  
     2 Todd    male   left  right   left    none     regul  imperial
     3 Gerald  male   right left    neither none     occas  <NA>    
     4 Robert  male   right right   neither none     never  metric  
     5 Dustin  male   right neither right   some     never  metric  
     6 Abby    female right left    right   some     never  imperial
     7 Andre   male   right left    right   freq     never  imperial
     8 Michael female right right   right   freq     never  metric  
     9 Edward  male   right right   right   some     never  metric  
    10 Carl    male   right right   right   some     never  metric  
    # … with 223 more rows
    # 2) Because every non-numerical variable in survey is of type character
    # (<chr>), the following is also a correct solution.
    select(survey, where(is.character))
    # A tibble: 233 × 8
       name    gender hand  fold    clap    exercise smokes m.i     
       <chr>   <chr>  <chr> <chr>   <chr>   <chr>    <chr>  <chr>   
     1 Alyson  female right right   left    some     never  metric  
     2 Todd    male   left  right   left    none     regul  imperial
     3 Gerald  male   right left    neither none     occas  <NA>    
     4 Robert  male   right right   neither none     never  metric  
     5 Dustin  male   right neither right   some     never  metric  
     6 Abby    female right left    right   some     never  imperial
     7 Andre   male   right left    right   freq     never  imperial
     8 Michael female right right   right   freq     never  metric  
     9 Edward  male   right right   right   some     never  metric  
    10 Carl    male   right right   right   some     never  metric  
    # … with 223 more rows

Selection by pattern matching

In data sets with a large number of variables, finding variables by hand becomes tedious. Several helper functions are available to speed up the search for variable names.

starts_with(), ends_with() and contains()

These functions help to find fixed patterns in variable names:

# Select the variables whose names start with the character 'a'
select(pulse, starts_with("a"))
# A tibble: 110 × 2
     age alcohol
   <dbl> <chr>  
 1    18 yes    
 2    19 yes    
 3    18 yes    
 4    18 yes    
 5    18 yes    
 6    22 yes    
 7    20 yes    
 8    18 yes    
 9    19 yes    
10    23 yes    
# … with 100 more rows
# Select the variables whose names end with the character 'e'
select(pulse, ends_with("e"))
# A tibble: 110 × 3
   name        age exercise
   <chr>     <dbl> <chr>   
 1 Bonnie       18 moderate
 2 Melanie      19 moderate
 3 Consuelo     18 high    
 4 Travis       18 high    
 5 Lauri        18 low     
 6 George       22 low     
 7 Cherry       20 moderate
 8 Francesca    18 moderate
 9 Sonja        19 high    
10 Troy         23 moderate
# … with 100 more rows
# Select the variables whose names contain the character 'i'
select(pulse, contains("i"))
# A tibble: 110 × 4
   id     height weight exercise
   <chr>   <dbl>  <dbl> <chr>   
 1 1993_A    173     57 moderate
 2 1993_B    179     58 moderate
 3 1993_C    167     62 high    
 4 1993_D    195     84 high    
 5 1993_E    173     64 low     
 6 1993_F    184     74 low     
 7 1993_G    162     57 moderate
 8 1993_H    169     55 moderate
 9 1993_I    164     56 high    
10 1993_J    168     60 moderate
# … with 100 more rows

The helper functions can be combined with the logical operators {!, |, &}, which will be explained later. You have already encountered one of them, the negation operator !, in the lecture on Useful R functions. In short, it complements the result. For example, above we selected the variables starting with the character ‘a’ with select(pulse, starts_with("a")), which resulted in a tibble with the two variables age and alcohol. Placing ! in front of the helper function produces the complement of that result, namely all variables that do not start with ‘a’:

# Complement of starts_with("a"): every variable whose name does not start with 'a'.
select(pulse, !starts_with("a"))
# A tibble: 110 × 11
   id     name      height weight gender smokes exercise ran   pulse1 pulse2  year
   <chr>  <chr>      <dbl>  <dbl> <chr>  <chr>  <chr>    <chr>  <dbl>  <dbl> <dbl>
 1 1993_A Bonnie       173     57 female no     moderate sat       86     88  1993
 2 1993_B Melanie      179     58 female no     moderate ran       82    150  1993
 3 1993_C Consuelo     167     62 female no     high     ran       96    176  1993
 4 1993_D Travis       195     84 male   no     high     sat       71     73  1993
 5 1993_E Lauri        173     64 female no     low      sat       90     88  1993
 6 1993_F George       184     74 male   no     low      ran       78    141  1993
 7 1993_G Cherry       162     57 female no     moderate sat       68     72  1993
 8 1993_H Francesca    169     55 female no     moderate sat       71     77  1993
 9 1993_I Sonja        164     56 female no     high     sat       68     68  1993
10 1993_J Troy         168     60 male   no     moderate ran       88    150  1993
# … with 100 more rows

Note that age and alcohol do not occur in the result.

There are several other helper functions which fall beyond the scope of this lecture; visit here for more details (see also ?dplyr::select).

  5. Select variables from the survey data by pattern matching.

    5.1 Select variables that end with ‘e’.

    # Keep the variables whose names end with 'e'.
    select(survey, ends_with("e"))
    # A tibble: 233 × 4
       name    pulse exercise   age
       <chr>   <dbl> <chr>    <dbl>
     1 Alyson     92 some      18.2
     2 Todd      104 none      17.6
     3 Gerald     87 none      16.9
     4 Robert     NA none      20.3
     5 Dustin     35 some      23.7
     6 Abby       64 some      21  
     7 Andre      83 freq      18.8
     8 Michael    74 freq      35.8
     9 Edward     72 some      19  
    10 Carl       90 some      22.3
    # … with 223 more rows

    5.2 Select variables that start with ‘s’.

    # Keep the variables whose names start with 's'.
    select(survey, starts_with("s"))
    # A tibble: 233 × 3
       span1 span2 smokes
       <dbl> <dbl> <chr> 
     1  18.5  18   never 
     2  19.5  20.5 regul 
     3  18    13.3 occas 
     4  18.8  18.9 never 
     5  20    20   never 
     6  18    17.7 never 
     7  17.7  17.7 never 
     8  17    17.3 never 
     9  20    19.5 never 
    10  18.5  18.5 never 
    # … with 223 more rows

    5.3 Select hand span variables using a helper function.

    # 1) Match the fixed pattern 'span' anywhere in the variable name.
    select(survey, contains("span"))
    # A tibble: 233 × 2
       span1 span2
       <dbl> <dbl>
     1  18.5  18  
     2  19.5  20.5
     3  18    13.3
     4  18.8  18.9
     5  20    20  
     6  18    17.7
     7  17.7  17.7
     8  17    17.3
     9  20    19.5
    10  18.5  18.5
    # … with 223 more rows
    # 2) Alternative with the same result, because both hand span variable
    # names begin with 'span':
    # select(survey, starts_with('span'))


Copyright © 2023 Biomedical Data Sciences (BDS) | LUMC